# install.packages("tidyverse")
# install.packages("timetk")
library(tidyverse)
library(readr) #read_csv()
library(dplyr)
library(glmnet)
library(caret) # may not need anymore
library(timetk)
library(rsample)
# Load data:
df <- read_csv("./data/walmart/Walmart.csv")
head(df)
Fix Datatypes, Sort Values
# Fixing datatypes:
# any(grepl("/", df$Date))
print(sapply(df, class))
Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price CPI Unemployment
"numeric" "character" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
df$Date <- gsub("/", "-", df$Date) # replace slashes with dashes, so all dates follow same format
df$Date <- as.Date(df$Date, format = "%d-%m-%Y") # NOTE european-like date: day/month/year
print(sapply(df, class))
Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price CPI Unemployment
"numeric" "Date" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
# Sort values:
df <- df[order(df$Date, decreasing = FALSE), ] # gets an ordering of rows based on date and then df[ix] to select that order
head(df)
print(dim(df))
[1] 6435 8
table(df$Date) # value_counts
2010-02-05 2010-02-12 2010-02-19 2010-02-26 2010-03-05 2010-03-12 2010-03-19 2010-03-26 2010-04-02 2010-04-09 2010-04-16 2010-04-23 2010-04-30 2010-05-07 2010-05-14 2010-05-21 2010-05-28 2010-06-04
45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45
2010-06-11 2010-06-18 2010-06-25 2010-07-02 2010-07-09 2010-07-16 2010-07-23 2010-07-30 2010-08-06 2010-08-13 2010-08-20 2010-08-27 2010-09-03 2010-09-10 2010-09-17 2010-09-24 2010-10-01 2010-10-08
45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45
2010-10-15 2010-10-22 2010-10-29 2010-11-05 2010-11-12 2010-11-19 2010-11-26 2010-12-03 2010-12-10 2010-12-17 2010-12-24 2010-12-31 2011-01-07 2011-01-14 2011-01-21 2011-01-28 2011-02-04 2011-02-11
45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45
2011-02-18 2011-02-25 2011-03-04 2011-03-11 2011-03-18 2011-03-25 2011-04-01 2011-04-08 2011-04-15 2011-04-22 2011-04-29 2011-05-06 2011-05-13 2011-05-20 2011-05-27 2011-06-03 2011-06-10 2011-06-17
45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45
2011-06-24 2011-07-01 2011-07-08 2011-07-15 2011-07-22 2011-07-29 2011-08-05 2011-08-12 2011-08-19 2011-08-26 2011-09-02 2011-09-09 2011-09-16 2011-09-23 2011-09-30 2011-10-07 2011-10-14 2011-10-21
45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45
2011-10-28 2011-11-04 2011-11-11 2011-11-18 2011-11-25 2011-12-02 2011-12-09 2011-12-16 2011-12-23 2011-12-30 2012-01-06 2012-01-13 2012-01-20 2012-01-27 2012-02-03 2012-02-10 2012-02-17 2012-02-24
45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45
2012-03-02 2012-03-09 2012-03-16 2012-03-23 2012-03-30 2012-04-06 2012-04-13 2012-04-20 2012-04-27 2012-05-04 2012-05-11 2012-05-18 2012-05-25 2012-06-01 2012-06-08 2012-06-15 2012-06-22 2012-06-29
45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45
2012-07-06 2012-07-13 2012-07-20 2012-07-27 2012-08-03 2012-08-10 2012-08-17 2012-08-24 2012-08-31 2012-09-07 2012-09-14 2012-09-21 2012-09-28 2012-10-05 2012-10-12 2012-10-19 2012-10-26
45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45
There are 45 stores, so this makes sense, that there are 45 entries for each date. Also, each date represents one week.
Normalize weekly sales within each store, so that no one store dominates, and weekly sales are comparable: i.e. (sales_store - mean(sales_store)) / std(sales_store) and can convert back later via (scaled_sales * store_std) + store_mean
df <- df %>%
group_by(Store) %>%
mutate(
sales_mean = mean(Weekly_Sales, na.rm = TRUE),
sales_std = sd(Weekly_Sales, na.rm = TRUE),
weekly_sales_scaled = (Weekly_Sales - sales_mean) / sales_std
) %>%
ungroup()
head(df)
for (store_id in 1:length(unique(df$Store))) {
p <- df %>%
filter(Store == store_id) %>%
ggplot(aes(x = Date, y = weekly_sales_scaled)) +
geom_line(color = "steelblue") +
labs(title = paste("Weekly Sales Over Time - Store", store_id),
x = "Date",
y = "Weekly Sales (scaled per store)")
print(p)
}
NA
NA
# (Can't randomize the data first, because it is time-series)
# Train/Test Split:
split_ix <- floor(.8*nrow(df))
train_df <- df[1:split_ix,]
test_df <- df[(split_ix+1):nrow(df),]
print(dim(train_df))
[1] 5148 11
print(dim(test_df))
[1] 1287 11
cat("train:", format(min(train_df$Date), "%Y-%m-%d"), " to ", format(max(train_df$Date), "%Y-%m-%d"),"\n")
train: 2010-02-05 to 2012-04-13
cat("test:", format(min(test_df$Date), "%Y-%m-%d"), " to ", format(max(test_df$Date), "%Y-%m-%d"),"\n")
test: 2012-04-13 to 2012-10-26
print(colnames(df))
[1] "Store" "Date" "Weekly_Sales"
[4] "Holiday_Flag" "Temperature" "Fuel_Price"
[7] "CPI" "Unemployment" "sales_mean"
[10] "sales_std" "weekly_sales_scaled"
# Define design matrix pattern (to make it easier to form X's during cross val):
design_matrix_formula <- weekly_sales_scaled ~ . - Store - Date - Weekly_Sales - sales_mean - sales_std + 0 # 0 means no intercept, glmnet will add its own; here we drop columns
# model.matrix creates a design matrix from formula, auto-encodes categorical as dummy vars (if any--in this case, holiday is already a double.)
X_test <- model.matrix(design_matrix_formula, data=test_df)
y_test <- test_df$weekly_sales_scaled
print(dim(X_test))
[1] 1287 5
print(X_test[1:10,]) # see example of columns left for prediction
Holiday_Flag Temperature Fuel_Price CPI Unemployment
1 0 44.42 4.187 137.8680 8.150
2 0 45.68 4.044 214.3127 7.139
3 0 69.03 3.891 221.1484 6.891
4 0 49.89 4.025 141.8434 7.671
5 0 41.81 4.025 137.8680 4.125
6 0 48.65 4.187 137.8680 8.983
7 0 42.46 4.044 214.3127 7.139
8 0 36.90 4.025 137.8680 7.489
9 0 52.22 4.187 141.8434 8.253
10 0 64.28 4.254 131.1080 11.627
Because, it is important to keep the future in the test, or else it could cheat and accidentally learn the past and use it to predict the past, some inherent pattern. i.e. could use info from the future to predict the past (leakage).
cv_folds <- time_series_cv(
data = train_df,
date_var = Date,
cumulative = FALSE, # no growing window (same size folds)
initial = "6 months", # length of train window
assess = "3 months", # length of validation window
skip = "3 months", # jump between folds
slice_limit = 5 # num folds max
)
plot_time_series_cv_plan(cv_folds, .date_var = Date, .value = weekly_sales_scaled, .interactive=TRUE)
# === Define Lambda search grid ===
# Easy way to get lambda grid with good scaling--from the glmnet function itself.
# Note that we will not keep this model fit, this is solely for the purpose of getting
# a lambda grid for our manual CV.
unused_X_all <- model.matrix(design_matrix_formula, data=train_df)
unused_y_all <- train_df$weekly_sales_scaled
unused_model <- glmnet(unused_X_all, unused_y_all, alpha=1)
lambda_grid <- unused_model$lambda
lambda_grid
[1] 0.1450961864 0.1322062411 0.1204614030 0.1097599440 0.1000091731 0.0911246338
[7] 0.0830293725 0.0756532718 0.0689324437 0.0628086754 0.0572289258 0.0521448657
[13] 0.0475124596 0.0432915836 0.0394456786 0.0359414333 0.0327484954 0.0298392093
[19] 0.0271883762 0.0247730358 0.0225722676 0.0205670095 0.0187398931 0.0170750926
[25] 0.0155581885 0.0141760419 0.0129166814 0.0117691990 0.0107236558 0.0097709958
[31] 0.0089029675 0.0081120524 0.0073914000 0.0067347684 0.0061364701 0.0055913230
[37] 0.0050946053 0.0046420146 0.0042296308 0.0038538821 0.0035115138 0.0031995606
[43] 0.0029153205 0.0026563314 0.0024203503 0.0022053330 0.0020094173 0.0018309062
[49] 0.0016682536 0.0015200505 0.0013850134 0.0012619726 0.0011498625 0.0010477119
[55] 0.0009546360 0.0008698288 0.0007925556 0.0007221471 0.0006579936 0.0005995392
[61] 0.0005462778 0.0004977480 0.0004535294 0.0004132391
n_folds <- length(cv_folds$splits)
n_lambdas <- length(lambda_grid)
cv_mses <- numeric(n_lambdas) # keep track of MSE across different lambdas
lambda_ix <- 1
for (lambda in lambda_grid) {
fold_mses <- numeric(n_folds) # array for keeping track of mse across folds
for (fold_ix in 1:n_folds) {
# Get train, valid data for current cv fold:
fold <- cv_folds$splits[[fold_ix]]
train_data <- analysis(fold)
valid_data <- assessment(fold)
# Manipulate data into design matrices:
X_train <- model.matrix(design_matrix_formula, data=train_data)
y_train <- train_data$weekly_sales_scaled
X_val <- model.matrix(design_matrix_formula, data=valid_data)
y_val <- valid_data$weekly_sales_scaled
# Fit LASSO (non-cv version):
lasso_model <- glmnet(X_train, y_train, alpha=1, lambda=lambda)
# Get val error:
yhat_val <- predict(lasso_model, newx=X_val, s=lambda)
fold_mses[fold_ix] <- mean((y_val - yhat_val)^2)
}
cv_mses[lambda_ix] <- mean(fold_mses)
lambda_ix <- lambda_ix + 1
}
best_lambda <- lambda_grid[which.min(cv_mses)]
cat("best lambda: ", best_lambda, "with min mse ", min(cv_mses), "\n\n")
best lambda: 0.1450962 with min mse 1.083355
plot(x=lambda_grid,
y=cv_mses,
main = paste0(n_folds, "-Fold Cross-Validation MSE for different Lambdas"),
xlab = "Lambda",
ylab = "Mean CV MSE")
#best_ix <- which.min(cv_mses)
#points(best_lambda, cv_mses[best_ix], col = "red", pch = 19, cex = 1.5)
#text(best_lambda, cv_mses[best_ix],
# labels = paste("Best λ =", round(best_lambda, 4)),
# pos = 4, col = "red")
X_train <- model.matrix(design_matrix_formula, data=train_df)
y_train <- train_df$weekly_sales_scaled
best_lasso_model <- glmnet(X_train, y_train, alpha=1, lambda=best_lambda)
# Get val error:
yhat_test <- predict(best_lasso_model, newx=X_test, s=best_lambda)
mse_test <- mean((y_test - yhat_test)^2)
cat("Test MSE: ", mse_test, "\n\n")
Test MSE: 0.4062607
coef(best_lasso_model)
6 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) -0.002841935
Holiday_Flag 0.000000000
Temperature .
Fuel_Price .
CPI .
Unemployment .